# General
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(skimr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# date
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
#track time
#devtools::install_github("jabiru/tictoc")
library(tictoc)
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
loans <- read_csv("Loans.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## loan_amnt = col_integer(),
## funded_amnt = col_integer(),
## installment = col_double(),
## annual_inc = col_double(),
## dti = col_double(),
## delinq_2yrs = col_integer(),
## inq_last_6mths = col_integer(),
## mths_since_last_delinq = col_integer(),
## mths_since_last_record = col_integer(),
## open_acc = col_integer(),
## pub_rec = col_integer(),
## revol_bal = col_integer(),
## revol_util = col_double(),
## total_acc = col_integer(),
## policy_code = col_integer(),
## loan_defaulted = col_integer()
## )
## See spec(...) for full column specifications.
## cols(
## loan_amnt = col_integer(),
## funded_amnt = col_integer(),
## term = col_character(),
## int_rate = col_character(),
## installment = col_double(),
## grade = col_character(),
## sub_grade = col_character(),
## emp_title = col_character(),
## emp_length = col_character(),
## home_ownership = col_character(),
## annual_inc = col_double(),
## verification_status = col_character(),
## pymnt_plan = col_character(),
## desc = col_character(),
## purpose = col_character(),
## title = col_character(),
## zip_code = col_character(),
## addr_state = col_character(),
## dti = col_double(),
## delinq_2yrs = col_integer(),
## earliest_cr_line = col_character(),
## inq_last_6mths = col_integer(),
## mths_since_last_delinq = col_integer(),
## mths_since_last_record = col_integer(),
## open_acc = col_integer(),
## pub_rec = col_integer(),
## revol_bal = col_integer(),
## revol_util = col_double(),
## total_acc = col_integer(),
## initial_list_status = col_character(),
## mths_since_last_major_derog = col_character(),
## policy_code = col_integer(),
## loan_defaulted = col_integer()
## )
## # A tibble: 6 x 33
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <chr> <chr> <dbl> <chr> <chr>
## 1 4000 4000 60 m… 7.29% 79.8 A A4
## 2 16000 16000 60 m… 18.25% 408. F F1
## 3 8700 8700 36 m… 7.88% 272. A A5
## 4 18000 18000 60 m… 11.49% 396. B B4
## 5 16000 16000 36 m… 11.83% 530. B B3
## 6 3000 3000 60 m… 15.58% 72.3 D D3
## # ... with 26 more variables: emp_title <chr>, emp_length <chr>,
## # home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## # pymnt_plan <chr>, desc <chr>, purpose <chr>, title <chr>,
## # zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <int>,
## # earliest_cr_line <chr>, inq_last_6mths <int>,
## # mths_since_last_delinq <int>, mths_since_last_record <int>,
## # open_acc <int>, pub_rec <int>, revol_bal <int>, revol_util <dbl>,
## # total_acc <int>, initial_list_status <chr>,
## # mths_since_last_major_derog <chr>, policy_code <int>,
## # loan_defaulted <int>
## loan_amnt funded_amnt term int_rate
## Min. : 500 Min. : 500 Length:10000 Length:10000
## 1st Qu.: 5000 1st Qu.: 5000 Class :character Class :character
## Median : 9600 Median : 9250 Mode :character Mode :character
## Mean :11036 Mean :10766
## 3rd Qu.:15000 3rd Qu.:15000
## Max. :35000 Max. :35000
##
## installment grade sub_grade
## Min. : 15.69 Length:10000 Length:10000
## 1st Qu.: 163.47 Class :character Class :character
## Median : 275.11 Mode :character Mode :character
## Mean : 321.43
## 3rd Qu.: 427.04
## Max. :1276.60
##
## emp_title emp_length home_ownership annual_inc
## Length:10000 Length:10000 Length:10000 Min. : 2000
## Class :character Class :character Class :character 1st Qu.: 40000
## Mode :character Mode :character Mode :character Median : 58000
## Mean : 68203
## 3rd Qu.: 82000
## Max. :900000
## NA's :1
## verification_status pymnt_plan desc
## Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## purpose title zip_code
## Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## addr_state dti delinq_2yrs earliest_cr_line
## Length:10000 Min. : 0.00 Min. : 0.0000 Length:10000
## Class :character 1st Qu.: 8.16 1st Qu.: 0.0000 Class :character
## Mode :character Median :13.41 Median : 0.0000 Mode :character
## Mean :13.34 Mean : 0.1482
## 3rd Qu.:18.69 3rd Qu.: 0.0000
## Max. :29.99 Max. :11.0000
## NA's :5
## inq_last_6mths mths_since_last_delinq mths_since_last_record
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 18.00 1st Qu.: 0.00
## Median : 1.000 Median : 34.00 Median : 86.00
## Mean : 1.067 Mean : 35.89 Mean : 61.65
## 3rd Qu.: 2.000 3rd Qu.: 53.00 3rd Qu.:101.00
## Max. :25.000 Max. :120.00 Max. :119.00
## NA's :5 NA's :6316 NA's :9160
## open_acc pub_rec revol_bal revol_util
## Min. : 1.000 Min. :0.00000 Min. : 0 Min. : 0.00
## 1st Qu.: 6.000 1st Qu.:0.00000 1st Qu.: 3524 1st Qu.: 25.00
## Median : 9.000 Median :0.00000 Median : 8646 Median : 48.70
## Mean : 9.335 Mean :0.06013 Mean : 14271 Mean : 48.47
## 3rd Qu.:12.000 3rd Qu.:0.00000 3rd Qu.: 16952 3rd Qu.: 71.90
## Max. :39.000 Max. :3.00000 Max. :1207359 Max. :108.80
## NA's :5 NA's :5 NA's :23
## total_acc initial_list_status mths_since_last_major_derog
## Min. : 1.00 Length:10000 Length:10000
## 1st Qu.:13.00 Class :character Class :character
## Median :20.00 Mode :character Mode :character
## Mean :22.09
## 3rd Qu.:29.00
## Max. :90.00
## NA's :5
## policy_code loan_defaulted
## Min. :1 Min. :0.0000
## 1st Qu.:1 1st Qu.:0.0000
## Median :1 Median :0.0000
## Mean :1 Mean :0.1295
## 3rd Qu.:1 3rd Qu.:0.0000
## Max. :1 Max. :1.0000
##
## # A tibble: 2 x 2
## loan_defaulted count
## <int> <int>
## 1 0 8705
## 2 1 1295
## # A tibble: 1 x 1
## rate
## <dbl>
## 1 0.130
sum(is.na(loans))/(nrow(loans)*ncol(loans))
## [1] 0.08894242
missing_character_values_tbl <- loans %>%
select_if(is.character) %>%
map_df(~ is.na(.) %>% sum()) %>%
gather() %>%
arrange(desc(value)) %>%
mutate(key = as_factor(key))
missing_character_values_tbl
## # A tibble: 17 x 2
## key value
## <fct> <int>
## 1 mths_since_last_major_derog 10000
## 2 desc 3229
## 3 emp_title 589
## 4 earliest_cr_line 5
## 5 title 3
## 6 term 0
## 7 int_rate 0
## 8 grade 0
## 9 sub_grade 0
## 10 emp_length 0
## 11 home_ownership 0
## 12 verification_status 0
## 13 pymnt_plan 0
## 14 purpose 0
## 15 zip_code 0
## 16 addr_state 0
## 17 initial_list_status 0
missing_numeric_values_tbl <- loans %>%
select_if(is.numeric) %>%
map_df(~ is.na(.) %>% sum()) %>%
gather() %>%
arrange(desc(value)) %>%
mutate(key = as_factor(key))
missing_numeric_values_tbl
## # A tibble: 16 x 2
## key value
## <fct> <int>
## 1 mths_since_last_record 9160
## 2 mths_since_last_delinq 6316
## 3 revol_util 23
## 4 delinq_2yrs 5
## 5 inq_last_6mths 5
## 6 open_acc 5
## 7 pub_rec 5
## 8 total_acc 5
## 9 annual_inc 1
## 10 loan_amnt 0
## 11 funded_amnt 0
## 12 installment 0
## 13 dti 0
## 14 revol_bal 0
## 15 policy_code 0
## 16 loan_defaulted 0
loans%>%filter(is.na(annual_inc))
## # A tibble: 1 x 33
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <chr> <chr> <dbl> <chr> <chr>
## 1 6700 6700 36 m… 7.75% 209. A A3
## # ... with 26 more variables: emp_title <chr>, emp_length <chr>,
## # home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## # pymnt_plan <chr>, desc <chr>, purpose <chr>, title <chr>,
## # zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <int>,
## # earliest_cr_line <chr>, inq_last_6mths <int>,
## # mths_since_last_delinq <int>, mths_since_last_record <int>,
## # open_acc <int>, pub_rec <int>, revol_bal <int>, revol_util <dbl>,
## # total_acc <int>, initial_list_status <chr>,
## # mths_since_last_major_derog <chr>, policy_code <int>,
## # loan_defaulted <int>
I have seen some cases where the identified puprose category does not match the desc. A proper deep dive into desc text in “Phase 2 For now will use purpose feature in modelling as I suspect the vast majority of cases would be accurate (80/20 approach)”
loans %>% select( purpose, title,desc)%>% unique()%>% arrange(purpose) %>%group_by(purpose)%>%top_n(10)
## Selecting by desc
## # A tibble: 140 x 3
## # Groups: purpose [14]
## purpose title desc
## <chr> <chr> <chr>
## 1 car FICO score 762 want's … This loan would be for a 2006 PT Cruis…
## 2 car purchase work pickup t… This loan would purchase a small picku…
## 3 car personal buying a car for my son
## 4 car Car Payment To pay off car loan with high interest…
## 5 car Toby's Truck We are helping our son purchase a truc…
## 6 car Truck Loan This loan request is for a fantastic f…
## 7 car vehicle loan a loan to purchase a vehicle
## 8 car Loan Would like to try to purchase a new ca…
## 9 car Debt Consolidation Trying to consolidate all my loans. Th…
## 10 car Purchase car for daugh… Trying to purchase 2001 Honda Accord. …
## # ... with 130 more rows
loans %>% select(emp_title)%>% group_by(emp_title)%>% summarise(count=n())%>%arrange(desc(count))
## # A tibble: 8,185 x 2
## emp_title count
## <chr> <int>
## 1 <NA> 589
## 2 US Army 37
## 3 Bank of America 23
## 4 IBM 22
## 5 US Navy 17
## 6 USAF 17
## 7 United States Air Force 16
## 8 Wells Fargo 15
## 9 AT&T 14
## 10 Self Employed 14
## # ... with 8,175 more rows
loans %>% select_if(is.character) %>% names()
## [1] "term" "int_rate"
## [3] "grade" "sub_grade"
## [5] "emp_title" "emp_length"
## [7] "home_ownership" "verification_status"
## [9] "pymnt_plan" "desc"
## [11] "purpose" "title"
## [13] "zip_code" "addr_state"
## [15] "earliest_cr_line" "initial_list_status"
## [17] "mths_since_last_major_derog"
unique_character_values_tbl <- loans %>%
select_if(is.character) %>%
map_df(~ unique(.) %>% length()) %>%
gather() %>%
arrange(value) %>%
mutate(key = as_factor(key))
unique_character_values_tbl
## # A tibble: 17 x 2
## key value
## <fct> <int>
## 1 initial_list_status 1
## 2 mths_since_last_major_derog 1
## 3 term 2
## 4 pymnt_plan 2
## 5 verification_status 3
## 6 home_ownership 5
## 7 grade 7
## 8 emp_length 12
## 9 purpose 14
## 10 sub_grade 35
## 11 addr_state 50
## 12 int_rate 359
## 13 earliest_cr_line 464
## 14 zip_code 720
## 15 title 5678
## 16 desc 6763
## 17 emp_title 8185
loans %>% select_if(is.numeric) %>% names()
## [1] "loan_amnt" "funded_amnt"
## [3] "installment" "annual_inc"
## [5] "dti" "delinq_2yrs"
## [7] "inq_last_6mths" "mths_since_last_delinq"
## [9] "mths_since_last_record" "open_acc"
## [11] "pub_rec" "revol_bal"
## [13] "revol_util" "total_acc"
## [15] "policy_code" "loan_defaulted"
unique_numeric_values_tbl <- loans %>%
select_if(is.numeric) %>%
map_df(~ unique(.) %>% length()) %>%
gather() %>%
arrange(value) %>%
mutate(key = as_factor(key))
unique_numeric_values_tbl
## # A tibble: 16 x 2
## key value
## <fct> <int>
## 1 policy_code 1
## 2 loan_defaulted 2
## 3 pub_rec 5
## 4 delinq_2yrs 11
## 5 inq_last_6mths 21
## 6 open_acc 37
## 7 total_acc 75
## 8 mths_since_last_delinq 92
## 9 mths_since_last_record 95
## 10 loan_amnt 573
## 11 funded_amnt 727
## 12 revol_util 1031
## 13 annual_inc 1902
## 14 dti 2585
## 15 installment 6381
## 16 revol_bal 8130
Note: for analysis purposes mean value imputation is being completed pre-modelling, however, going forward best practice is to incorporate during modelling based on n-cross validation to ensure robust modelling outcomes and minimising and leakage and/or overtraining.
replace_with_0 <- c("mths_since_last_delinq","delinq_2yrs","inq_last_6mths","open_acc","pub_rec","total_acc")
# Replace missing values with 0
for (i in seq_along(replace_with_0)){
loans01[is.na(loans01[,replace_with_0[i]]), replace_with_0[i]] <- 0
}
#Replace missing with mean value
mean_revol_util<-loans01 %>% select_("revol_util") %>% summarise(mean(revol_util,na.rm = TRUE))
mean_annual_inc <- loans01 %>% select_("annual_inc") %>% summarise(mean(annual_inc,na.rm = TRUE))
loans01 %>% filter(revol_bal==0)
## # A tibble: 278 x 26
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <chr> <chr> <dbl> <chr> <chr>
## 1 8000 8000 36 m… 14.61% 276. D D1
## 2 7800 7800 60 m… 10.00% 166. B B2
## 3 16750 16750 36 m… 12.53% 561. B B5
## 4 2750 2750 36 m… 16.95% 98.0 E E5
## 5 6450 6450 36 m… 5.99% 196. A A2
## 6 7000 7000 36 m… 7.51% 218. A A3
## 7 15000 15000 60 m… 21.28% 408. F F2
## 8 5000 5000 36 m… 13.57% 170. C C3
## 9 7000 7000 36 m… 6.99% 216. A A3
## 10 1600 1600 36 m… 14.42% 55.0 D D2
## # ... with 268 more rows, and 19 more variables: emp_length <chr>,
## # home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## # pymnt_plan <chr>, purpose <chr>, zip_code <chr>, addr_state <chr>,
## # dti <dbl>, delinq_2yrs <dbl>, earliest_cr_line <chr>,
## # inq_last_6mths <dbl>, mths_since_last_delinq <dbl>, open_acc <dbl>,
## # pub_rec <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # loan_defaulted <int>
loans01[is.na(loans01[,"revol_util"]), "revol_util"] <- mean_revol_util
loans01[is.na(loans01[,"annual_inc"]), "annual_inc"] <- mean_annual_inc
#Log transform right skewed features
loans01$annual_inc_log <- log(loans01$annual_inc+1)
loans01$loan_amnt_log <- log(loans01$loan_amnt+1)
loans01$funded_amnt_log <- log(loans01$funded_amnt+1)
loans01$revol_bal_log <- log(loans01$revol_bal+1)
loans01$installment_log <- log(loans01$installment+1)
#New features based on business knowledge/interactions
loans01$funded_loan_amnt_ratio <- (loans01$funded_amnt/loans01$loan_amnt)^3
loans01$installment_inc_ratio <- 12*loans01$installment/loans01$annual_inc
loans01$open_total_acc_ratio <- ifelse(loans01$total_acc==0,0,loans01$open_acc/(loans01$total_acc))
#categorise numeric features
loans01$pub_rec_flag <- as.factor(ifelse(loans01$pub_rec>0,1,loans01$pub_rec))
#new features
#earliest_cr_line
loans01 %>% group_by(earliest_cr_line) %>% summarise(count=n()) %>% arrange(earliest_cr_line)
## # A tibble: 464 x 2
## earliest_cr_line count
## <chr> <int>
## 1 1/01/1963 1
## 2 1/01/1968 2
## 3 1/01/1972 1
## 4 1/01/1973 3
## 5 1/01/1975 1
## 6 1/01/1976 2
## 7 1/01/1977 3
## 8 1/01/1978 3
## 9 1/01/1979 6
## 10 1/01/1980 3
## # ... with 454 more rows
loans01$earliest_cr_line_date <- as.Date(loans01$earliest_cr_line,"%m/%d/%Y",na.action(omit))
loans01$earliest_cr_line_years <- as.numeric(time_length(interval(loans01$earliest_cr_line_date,as.Date("2018-01-01")),"years"))
#impute missing numeric for earliest_cr_years
mean_earliest_cr_line_years <- loans01 %>% select_("earliest_cr_line_years") %>% summarise(mean(earliest_cr_line_years,na.rm = TRUE))
loans01[is.na(loans01[,"earliest_cr_line_years"]), "earliest_cr_line_years"] <- mean_earliest_cr_line_years
loans01 %>% filter(is.na(earliest_cr_line))
## # A tibble: 5 x 37
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <chr> <chr> <dbl> <chr> <chr>
## 1 3500 3500 36 m… 10.28% 113. C C1
## 2 1000 1000 36 m… 7.12% 30.9 A A1
## 3 2000 2000 36 m… 9.96% 64.5 B B5
## 4 2600 2600 36 m… 8.38% 81.9 A A5
## 5 6700 6700 36 m… 7.75% 209. A A3
## # ... with 30 more variables: emp_length <chr>, home_ownership <chr>,
## # annual_inc <dbl>, verification_status <chr>, pymnt_plan <chr>,
## # purpose <chr>, zip_code <chr>, addr_state <chr>, dti <dbl>,
## # delinq_2yrs <dbl>, earliest_cr_line <chr>, inq_last_6mths <dbl>,
## # mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## # revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## # funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## # funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## # open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## # earliest_cr_line_date <date>, earliest_cr_line_years <dbl>
head(loans01)
## # A tibble: 6 x 37
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <chr> <chr> <dbl> <chr> <chr>
## 1 4000 4000 60 m… 7.29% 79.8 A A4
## 2 16000 16000 60 m… 18.25% 408. F F1
## 3 8700 8700 36 m… 7.88% 272. A A5
## 4 18000 18000 60 m… 11.49% 396. B B4
## 5 16000 16000 36 m… 11.83% 530. B B3
## 6 3000 3000 60 m… 15.58% 72.3 D D3
## # ... with 30 more variables: emp_length <chr>, home_ownership <chr>,
## # annual_inc <dbl>, verification_status <chr>, pymnt_plan <chr>,
## # purpose <chr>, zip_code <chr>, addr_state <chr>, dti <dbl>,
## # delinq_2yrs <dbl>, earliest_cr_line <chr>, inq_last_6mths <dbl>,
## # mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## # revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## # funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## # funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## # open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## # earliest_cr_line_date <date>, earliest_cr_line_years <dbl>
loans01 %>% group_by(term) %>% summarise(count=n())
## # A tibble: 2 x 2
## term count
## <chr> <int>
## 1 36 months 7441
## 2 60 months 2559
loans01 %>% group_by(purpose) %>% summarise(count=n())
## # A tibble: 14 x 2
## purpose count
## <chr> <int>
## 1 car 352
## 2 credit_card 1290
## 3 debt_consolidation 4556
## 4 educational 112
## 5 home_improvement 811
## 6 house 97
## 7 major_purchase 551
## 8 medical 185
## 9 moving 162
## 10 other 1044
## 11 renewable_energy 19
## 12 small_business 473
## 13 vacation 96
## 14 wedding 252
#convert to integer/numeric
loans01 %>% group_by(int_rate) %>% summarise(count=n())
## # A tibble: 359 x 2
## int_rate count
## <chr> <int>
## 1 10.00% 56
## 2 10.01% 4
## 3 10.08% 15
## 4 10.14% 1
## 5 10.20% 5
## 6 10.25% 58
## 7 10.28% 7
## 8 10.33% 1
## 9 10.36% 68
## 10 10.37% 117
## # ... with 349 more rows
loans01$int_rate_num <- as.numeric(gsub("%","",loans01$int_rate))
loans01 %>% group_by(emp_length) %>% summarise(count=n()) %>% arrange(desc(count))
## # A tibble: 12 x 2
## emp_length count
## <chr> <int>
## 1 10+ years 2185
## 2 < 1 year 1183
## 3 2 years 1147
## 4 3 years 1017
## 5 4 years 899
## 6 1 year 872
## 7 5 years 788
## 8 6 years 539
## 9 7 years 424
## 10 8 years 353
## 11 9 years 334
## 12 n/a 259
loans01$emp_length_num <- as.numeric(gsub("\\D","",loans01$emp_length))
loans01 %>% filter(emp_length=="n/a") %>% select(emp_length,emp_length_num,annual_inc,loan_defaulted)
## # A tibble: 259 x 4
## emp_length emp_length_num annual_inc loan_defaulted
## <chr> <dbl> <dbl> <int>
## 1 n/a NA 55200 0
## 2 n/a NA 63000 0
## 3 n/a NA 20000 1
## 4 n/a NA 12360 0
## 5 n/a NA 30000 0
## 6 n/a NA 30000 0
## 7 n/a NA 28000 0
## 8 n/a NA 61000 0
## 9 n/a NA 29736 0
## 10 n/a NA 66000 0
## # ... with 249 more rows
#impute missing numeric for emp_length
mean_emp_length <- loans01 %>% select_("emp_length_num") %>% summarise(mean(emp_length_num,na.rm = TRUE))
loans01[is.na(loans01[,"emp_length_num"]), "emp_length_num"] <- mean_emp_length
#target encode these during modelling on h2o - too many levels !!!!
loans01 %>% group_by(sub_grade) %>% summarise(count=n())
## # A tibble: 35 x 2
## sub_grade count
## <chr> <int>
## 1 A1 250
## 2 A2 389
## 3 A3 407
## 4 A4 715
## 5 A5 688
## 6 B1 450
## 7 B2 490
## 8 B3 742
## 9 B4 606
## 10 B5 664
## # ... with 25 more rows
loans01 %>% group_by(addr_state) %>% summarise(count=n())
## # A tibble: 50 x 2
## addr_state count
## <chr> <int>
## 1 AK 26
## 2 AL 119
## 3 AR 48
## 4 AZ 210
## 5 CA 1748
## 6 CO 175
## 7 CT 195
## 8 DC 54
## 9 DE 35
## 10 FL 714
## # ... with 40 more rows
loans01 %>% group_by(zip_code) %>% summarise(count=n())
## # A tibble: 720 x 2
## zip_code count
## <chr> <int>
## 1 007xx 1
## 2 010xx 32
## 3 011xx 5
## 4 012xx 2
## 5 013xx 7
## 6 014xx 12
## 7 015xx 11
## 8 016xx 4
## 9 017xx 10
## 10 018xx 33
## # ... with 710 more rows
loans01 %>% group_by(pymnt_plan) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 2 x 3
## pymnt_plan count default
## <chr> <int> <dbl>
## 1 n 9998 0.129
## 2 y 2 1
loans01 %>% group_by(verification_status) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 3 x 3
## verification_status count default
## <chr> <int> <dbl>
## 1 not verified 4367 0.110
## 2 VERIFIED - income 3214 0.151
## 3 VERIFIED - income source 2419 0.136
#clean up verification status
loans01$verification_status_income <- as.factor(ifelse(loans01$verification_status=="not verified",0,1))
loans01 %>% group_by(verification_status_income) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 2 x 3
## verification_status_income count default
## <fct> <int> <dbl>
## 1 0 4367 0.110
## 2 1 5633 0.144
loans01 <- loans01 %>% mutate_if(sapply(loans01,is.character),as.factor)
sapply(loans01,class)
## loan_amnt funded_amnt
## "integer" "integer"
## term int_rate
## "factor" "factor"
## installment grade
## "numeric" "factor"
## sub_grade emp_length
## "factor" "factor"
## home_ownership annual_inc
## "factor" "numeric"
## verification_status pymnt_plan
## "factor" "factor"
## purpose zip_code
## "factor" "factor"
## addr_state dti
## "factor" "numeric"
## delinq_2yrs earliest_cr_line
## "numeric" "factor"
## inq_last_6mths mths_since_last_delinq
## "numeric" "numeric"
## open_acc pub_rec
## "numeric" "numeric"
## revol_bal revol_util
## "integer" "numeric"
## total_acc loan_defaulted
## "numeric" "integer"
## annual_inc_log loan_amnt_log
## "numeric" "numeric"
## funded_amnt_log revol_bal_log
## "numeric" "numeric"
## installment_log funded_loan_amnt_ratio
## "numeric" "numeric"
## installment_inc_ratio open_total_acc_ratio
## "numeric" "numeric"
## pub_rec_flag earliest_cr_line_date
## "factor" "Date"
## earliest_cr_line_years int_rate_num
## "numeric" "numeric"
## emp_length_num verification_status_income
## "numeric" "factor"
head(loans01)
## # A tibble: 6 x 40
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <fct> <fct> <dbl> <fct> <fct>
## 1 4000 4000 60 m… 7.29% 79.8 A A4
## 2 16000 16000 60 m… 18.25% 408. F F1
## 3 8700 8700 36 m… 7.88% 272. A A5
## 4 18000 18000 60 m… 11.49% 396. B B4
## 5 16000 16000 36 m… 11.83% 530. B B3
## 6 3000 3000 60 m… 15.58% 72.3 D D3
## # ... with 33 more variables: emp_length <fct>, home_ownership <fct>,
## # annual_inc <dbl>, verification_status <fct>, pymnt_plan <fct>,
## # purpose <fct>, zip_code <fct>, addr_state <fct>, dti <dbl>,
## # delinq_2yrs <dbl>, earliest_cr_line <fct>, inq_last_6mths <dbl>,
## # mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## # revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## # funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## # funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## # open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## # earliest_cr_line_date <date>, earliest_cr_line_years <dbl>,
## # int_rate_num <dbl>, emp_length_num <dbl>,
## # verification_status_income <fct>
char_var_names<- loans01 %>%
select_if(is.factor) %>%
names() %>%
as.list()
char_var_names
## [[1]]
## [1] "term"
##
## [[2]]
## [1] "int_rate"
##
## [[3]]
## [1] "grade"
##
## [[4]]
## [1] "sub_grade"
##
## [[5]]
## [1] "emp_length"
##
## [[6]]
## [1] "home_ownership"
##
## [[7]]
## [1] "verification_status"
##
## [[8]]
## [1] "pymnt_plan"
##
## [[9]]
## [1] "purpose"
##
## [[10]]
## [1] "zip_code"
##
## [[11]]
## [1] "addr_state"
##
## [[12]]
## [1] "earliest_cr_line"
##
## [[13]]
## [1] "pub_rec_flag"
##
## [[14]]
## [1] "verification_status_income"
for (i in seq(char_var_names)){
g <- ggplot(loans01,aes_string(char_var_names[[i]])) + geom_bar(aes(fill=as.factor(loan_defaulted))) + coord_flip()
print(g)
}
for (i in seq(char_var_names)){
g <- ggplot(loans01,aes_string(char_var_names[[i]])) + geom_bar(aes(fill=as.factor(loan_defaulted)),position="fill") +
ylab("Proportion per category") + coord_flip()
print(g)
}
num_var_names<- loans01 %>%
select_if(is.numeric) %>%
select(-loan_defaulted) %>%
names() %>%
as.list()
num_var_names
## [[1]]
## [1] "loan_amnt"
##
## [[2]]
## [1] "funded_amnt"
##
## [[3]]
## [1] "installment"
##
## [[4]]
## [1] "annual_inc"
##
## [[5]]
## [1] "dti"
##
## [[6]]
## [1] "delinq_2yrs"
##
## [[7]]
## [1] "inq_last_6mths"
##
## [[8]]
## [1] "mths_since_last_delinq"
##
## [[9]]
## [1] "open_acc"
##
## [[10]]
## [1] "pub_rec"
##
## [[11]]
## [1] "revol_bal"
##
## [[12]]
## [1] "revol_util"
##
## [[13]]
## [1] "total_acc"
##
## [[14]]
## [1] "annual_inc_log"
##
## [[15]]
## [1] "loan_amnt_log"
##
## [[16]]
## [1] "funded_amnt_log"
##
## [[17]]
## [1] "revol_bal_log"
##
## [[18]]
## [1] "installment_log"
##
## [[19]]
## [1] "funded_loan_amnt_ratio"
##
## [[20]]
## [1] "installment_inc_ratio"
##
## [[21]]
## [1] "open_total_acc_ratio"
##
## [[22]]
## [1] "earliest_cr_line_years"
##
## [[23]]
## [1] "int_rate_num"
##
## [[24]]
## [1] "emp_length_num"
for (i in seq(num_var_names)){
g <- ggplot(loans01,aes_string(num_var_names[[i]])) + geom_density(aes(color=as.factor(loan_defaulted)))
print(g)
}
#correlation plot
#correlation coeff. only
corr <- loans01 %>%
select_if(is.numeric) %>%
#select(-loan_defaulted) %>%
cor(use="complete.obs",method="pearson") %>%
round(2) %>%
as.matrix() %>%
melt()
ggplot(data = corr, aes(x=Var1, y=Var2, fill=value)) + geom_tile() +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1)) +
coord_fixed()
#p-values for the correlation coeff.
#loans01 %>%
#select_if(is.numeric) %>%
#as.matrix() %>%
#rcorr(type="pearson")
summary(loans01)
## loan_amnt funded_amnt term int_rate
## Min. : 500 Min. : 500 36 months:7441 10.99% : 240
## 1st Qu.: 5000 1st Qu.: 5000 60 months:2559 11.49% : 207
## Median : 9600 Median : 9250 13.49% : 196
## Mean :11036 Mean :10766 7.88% : 185
## 3rd Qu.:15000 3rd Qu.:15000 7.51% : 180
## Max. :35000 Max. :35000 7.49% : 162
## (Other):8830
## installment grade sub_grade emp_length
## Min. : 15.69 A:2449 B3 : 742 10+ years:2185
## 1st Qu.: 163.47 B:2952 A4 : 715 < 1 year :1183
## Median : 275.11 C:2005 A5 : 688 2 years :1147
## Mean : 321.43 D:1369 B5 : 664 3 years :1017
## 3rd Qu.: 427.04 E: 769 B4 : 606 4 years : 899
## Max. :1276.60 F: 327 C1 : 516 1 year : 872
## G: 129 (Other):6069 (Other) :2697
## home_ownership annual_inc verification_status
## MORTGAGE:4445 Min. : 2000 not verified :4367
## NONE : 1 1st Qu.: 40000 VERIFIED - income :3214
## OTHER : 34 Median : 58000 VERIFIED - income source:2419
## OWN : 775 Mean : 68203
## RENT :4745 3rd Qu.: 82000
## Max. :900000
##
## pymnt_plan purpose zip_code addr_state
## n:9998 debt_consolidation:4556 100xx : 158 CA :1748
## y: 2 credit_card :1290 112xx : 141 NY : 958
## other :1044 945xx : 129 FL : 714
## home_improvement : 811 070xx : 125 TX : 700
## major_purchase : 551 606xx : 114 NJ : 482
## small_business : 473 900xx : 107 VA : 392
## (Other) :1275 (Other):9226 (Other):5006
## dti delinq_2yrs earliest_cr_line inq_last_6mths
## Min. : 0.00 Min. : 0.0000 11/01/1998: 95 Min. : 0.000
## 1st Qu.: 8.16 1st Qu.: 0.0000 11/01/2000: 95 1st Qu.: 0.000
## Median :13.41 Median : 0.0000 12/01/1998: 91 Median : 1.000
## Mean :13.34 Mean : 0.1481 10/01/1998: 86 Mean : 1.066
## 3rd Qu.:18.69 3rd Qu.: 0.0000 10/01/1999: 84 3rd Qu.: 2.000
## Max. :29.99 Max. :11.0000 (Other) :9544 Max. :25.000
## NA's : 5
## mths_since_last_delinq open_acc pub_rec revol_bal
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 6.00 1st Qu.:0.0000 1st Qu.: 3524
## Median : 0.00 Median : 9.00 Median :0.0000 Median : 8646
## Mean : 13.22 Mean : 9.33 Mean :0.0601 Mean : 14271
## 3rd Qu.: 23.00 3rd Qu.:12.00 3rd Qu.:0.0000 3rd Qu.: 16952
## Max. :120.00 Max. :39.00 Max. :3.0000 Max. :1207359
##
## revol_util total_acc loan_defaulted annual_inc_log
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. : 7.601
## 1st Qu.: 25.00 1st Qu.:13.00 1st Qu.:0.0000 1st Qu.:10.597
## Median : 48.60 Median :20.00 Median :0.0000 Median :10.968
## Mean : 48.47 Mean :22.08 Mean :0.1295 Mean :10.962
## 3rd Qu.: 71.80 3rd Qu.:29.00 3rd Qu.:0.0000 3rd Qu.:11.314
## Max. :108.80 Max. :90.00 Max. :1.0000 Max. :13.710
##
## loan_amnt_log funded_amnt_log revol_bal_log installment_log
## Min. : 6.217 Min. : 6.217 Min. : 0.000 Min. :2.815
## 1st Qu.: 8.517 1st Qu.: 8.517 1st Qu.: 8.168 1st Qu.:5.103
## Median : 9.170 Median : 9.132 Median : 9.065 Median :5.621
## Mean : 9.063 Mean : 9.045 Mean : 8.665 Mean :5.549
## 3rd Qu.: 9.616 3rd Qu.: 9.616 3rd Qu.: 9.738 3rd Qu.:6.059
## Max. :10.463 Max. :10.463 Max. :14.004 Max. :7.153
##
## funded_loan_amnt_ratio installment_inc_ratio open_total_acc_ratio
## Min. :0.002326 Min. :0.001437 Min. :0.0000
## 1st Qu.:1.000000 1st Qu.:0.034832 1st Qu.:0.3333
## Median :1.000000 Median :0.057328 Median :0.4444
## Mean :0.970474 Mean :0.065452 Mean :0.4738
## 3rd Qu.:1.000000 3rd Qu.:0.088175 3rd Qu.:0.5882
## Max. :1.000000 Max. :0.435456 Max. :1.0000
##
## pub_rec_flag earliest_cr_line_date earliest_cr_line_years int_rate_num
## 0:9427 Min. :1950-12-01 Min. : 9.167 Min. : 5.42
## 1: 573 1st Qu.:1994-01-01 1st Qu.:16.252 1st Qu.: 9.45
## Median :1998-06-01 Median :19.586 Median :11.89
## Mean :1997-02-24 Mean :20.849 Mean :12.12
## 3rd Qu.:2001-10-01 3rd Qu.:24.000 3rd Qu.:14.65
## Max. :2008-11-01 Max. :67.085 Max. :23.91
## NA's :5
## emp_length_num verification_status_income
## Min. : 1.000 0:4367
## 1st Qu.: 2.000 1:5633
## Median : 4.000
## Mean : 5.012
## 3rd Qu.: 9.000
## Max. :10.000
##
loans01 <- loans01 %>%
select(-verification_status) %>%
select(-pymnt_plan) %>%
select(-earliest_cr_line) %>%
select(-earliest_cr_line_date) %>%
select(-pub_rec)
loans01 <- loans01 %>% select(-zip_code)
loans01$emp_length_na <- as.factor(ifelse(loans01$emp_length=="n/a",1,0))
head(loans01)
## # A tibble: 6 x 35
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <fct> <fct> <dbl> <fct> <fct>
## 1 4000 4000 60 m… 7.29% 79.8 A A4
## 2 16000 16000 60 m… 18.25% 408. F F1
## 3 8700 8700 36 m… 7.88% 272. A A5
## 4 18000 18000 60 m… 11.49% 396. B B4
## 5 16000 16000 36 m… 11.83% 530. B B3
## 6 3000 3000 60 m… 15.58% 72.3 D D3
## # ... with 28 more variables: emp_length <fct>, home_ownership <fct>,
## # annual_inc <dbl>, purpose <fct>, addr_state <fct>, dti <dbl>,
## # delinq_2yrs <dbl>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## # open_acc <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## # funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## # funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## # open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## # earliest_cr_line_years <dbl>, int_rate_num <dbl>,
## # emp_length_num <dbl>, verification_status_income <fct>,
## # emp_length_na <fct>
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
#h2o.shutdown()
h2o.init(nthreads=-1, max_mem_size = '2g')
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 12 hours 52 minutes
## H2O cluster timezone: Australia/Sydney
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.5
## H2O cluster version age: 26 days
## H2O cluster name: H2O_started_from_R_praneesh_kvn157
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.92 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
# Training data: Separate into x and y tibbles
x_train_tbl <- loans01 %>% select(-loan_defaulted)
y_train_tbl <- loans01 %>% select(loan_defaulted)
y_train_tbl <- y_train_tbl %>%
mutate(loan_defaulted = loan_defaulted %>% as.character() %>% as.factor())
head(x_train_tbl)
## # A tibble: 6 x 34
## loan_amnt funded_amnt term int_rate installment grade sub_grade
## <int> <int> <fct> <fct> <dbl> <fct> <fct>
## 1 4000 4000 60 m… 7.29% 79.8 A A4
## 2 16000 16000 60 m… 18.25% 408. F F1
## 3 8700 8700 36 m… 7.88% 272. A A5
## 4 18000 18000 60 m… 11.49% 396. B B4
## 5 16000 16000 36 m… 11.83% 530. B B3
## 6 3000 3000 60 m… 15.58% 72.3 D D3
## # ... with 27 more variables: emp_length <fct>, home_ownership <fct>,
## # annual_inc <dbl>, purpose <fct>, addr_state <fct>, dti <dbl>,
## # delinq_2yrs <dbl>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## # open_acc <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## # annual_inc_log <dbl>, loan_amnt_log <dbl>, funded_amnt_log <dbl>,
## # revol_bal_log <dbl>, installment_log <dbl>,
## # funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## # open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## # earliest_cr_line_years <dbl>, int_rate_num <dbl>,
## # emp_length_num <dbl>, verification_status_income <fct>,
## # emp_length_na <fct>
head(y_train_tbl)
## # A tibble: 6 x 1
## loan_defaulted
## <fct>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
data_h2o <- as.h2o(bind_cols(y_train_tbl, x_train_tbl))
##
|
| | 0%
|
|=================================================================| 100%
splits_h2o <- h2o.splitFrame(data_h2o, ratios = c(0.9), seed = 1234)
train_h2o <- splits_h2o[[1]]
te_holdout_h2o <- splits_h2o[[2]]
te_cols<- c("sub_grade","addr_state")
te_map <- h2o.target_encode_create(te_holdout_h2o, x = as.list(te_cols), y = "loan_defaulted")
te_map$sub_grade
## sub_grade numerator denominator
## 1 A1 0 19
## 2 A2 0 42
## 3 A3 2 37
## 4 A4 4 78
## 5 A5 7 69
## 6 B1 5 39
##
## [35 rows x 3 columns]
te_map$addr_state
## addr_state numerator denominator
## 1 AK 1 4
## 2 AL 3 10
## 3 AR 0 7
## 4 AZ 2 19
## 5 CA 24 173
## 6 CO 5 17
##
## [43 rows x 3 columns]
Holdout_type is none as there is a separate te_holdout set.
ext_train_h2o <- h2o.target_encode_apply(train_h2o, x = as.list(te_cols), y = "loan_defaulted",
target_encode_map = te_map,
holdout_type = "None",
noise_level = 0,
seed = 1234
)
head(ext_train_h2o[c("sub_grade", "TargetEncode_sub_grade")])
## sub_grade TargetEncode_sub_grade
## 1 A3 0.06406758
## 2 A3 0.06406758
## 3 A4 0.05148609
## 4 A5 0.10157812
## 5 A5 0.10157812
## 6 B2 0.07239862
head(ext_train_h2o[c("addr_state", "TargetEncode_addr_state")])
## addr_state TargetEncode_addr_state
## 1 AK 0.1409067
## 2 AK 0.1409067
## 3 AK 0.1409067
## 4 AK 0.1409067
## 5 AK 0.1409067
## 6 AK 0.1409067
str(ext_train_h2o)
## Class 'H2OFrame' <environment: 0x7f997cf53428>
## - attr(*, "op")= chr "colnames="
## - attr(*, "eval")= logi TRUE
## - attr(*, "id")= chr "RTMP_sid_80a3_39"
## - attr(*, "nrow")= int 8999
## - attr(*, "ncol")= int 37
## - attr(*, "types")=List of 37
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "int"
## ..$ : chr "int"
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "real"
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "real"
## ..$ : chr "enum"
## ..$ : chr "real"
## ..$ : chr "int"
## ..$ : chr "int"
## ..$ : chr "int"
## ..$ : chr "int"
## ..$ : chr "int"
## ..$ : chr "real"
## ..$ : chr "int"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "enum"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "enum"
## ..$ : chr "enum"
## ..$ : chr "real"
## ..$ : chr "real"
## - attr(*, "data")='data.frame': 10 obs. of 37 variables:
## ..$ addr_state : Factor w/ 50 levels "AK","AL","AR",..: 1 1 1 1 1 1 1 1 1 1
## ..$ sub_grade : Factor w/ 35 levels "A1","A2","A3",..: 3 3 4 5 5 7 8 9 10 11
## ..$ loan_defaulted : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1
## ..$ loan_amnt : num 8400 12400 3000 4000 15000 6200 8000 1000 23000 5000
## ..$ funded_amnt : num 8400 12400 3000 4000 15000 6200 8000 1000 23000 5000
## ..$ term : Factor w/ 2 levels "36 months","60 months": 1 1 1 1 1 1 2 1 2 1
## ..$ int_rate : Factor w/ 359 levels "10.00%","10.01%",..: 314 321 327 338 326 17 349 51 72 78
## ..$ installment : num 259.9 385.8 93.9 127 469.2 ...
## ..$ grade : Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 3
## ..$ emp_length : Factor w/ 12 levels "1 year","10+ years",..: 5 2 5 3 2 2 5 2 2 11
## ..$ home_ownership : Factor w/ 5 levels "MORTGAGE","NONE",..: 1 1 5 1 1 1 5 1 1 5
## ..$ annual_inc : num 120000 36800 45000 112000 78000 24000 55200 87000 80000 36000
## ..$ purpose : Factor w/ 14 levels "car","credit_card",..: 7 3 3 5 3 2 3 10 3 10
## ..$ dti : num 4.1 5.84 19.39 3.19 11.09 ...
## ..$ delinq_2yrs : num 0 0 0 0 0 0 0 1 0 0
## ..$ inq_last_6mths : num 0 0 0 0 0 1 1 0 1 3
## ..$ mths_since_last_delinq : num 0 0 0 72 0 0 0 13 0 0
## ..$ open_acc : num 18 5 10 5 7 10 6 5 14 9
## ..$ revol_bal : num 8761 6975 16487 554 40665 ...
## ..$ revol_util : num 8.7 46.8 63.2 39.6 61.3 55.2 15.2 55.2 60.3 82
## ..$ total_acc : num 48 15 15 12 16 11 21 18 54 12
## ..$ annual_inc_log : num 11.7 10.5 10.7 11.6 11.3 ...
## ..$ loan_amnt_log : num 9.04 9.43 8.01 8.29 9.62 ...
## ..$ funded_amnt_log : num 9.04 9.43 8.01 8.29 9.62 ...
## ..$ revol_bal_log : num 9.08 8.85 9.71 6.32 10.61 ...
## ..$ installment_log : num 5.56 5.96 4.55 4.85 6.15 ...
## ..$ funded_loan_amnt_ratio : num 1 1 1 1 1 1 1 1 1 1
## ..$ installment_inc_ratio : num 0.026 0.1258 0.025 0.0136 0.0722 ...
## ..$ open_total_acc_ratio : num 0.375 0.333 0.667 0.417 0.438 ...
## ..$ pub_rec_flag : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1
## ..$ earliest_cr_line_years : num 29.7 13.9 16.6 15.3 24.3 ...
## ..$ int_rate_num : num 7.14 7.51 7.9 8.9 7.88 ...
## ..$ emp_length_num : num 4 10 4 2 10 10 4 10 10 1
## ..$ verification_status_income: Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 1 2 1
## ..$ emp_length_na : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1
## ..$ TargetEncode_sub_grade : num 0.0641 0.0641 0.0515 0.1016 0.1016 ...
## ..$ TargetEncode_addr_state : num 0.141 0.141 0.141 0.141 0.141 ...
y <- "loan_defaulted"
x <- setdiff(names(train_h2o), c(y, te_cols))
tic("Model_Training")
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
nfolds = 5,
#max_models = 10, #stop when either max of # models built or time runs out.
max_runtime_secs = 90
# Early Stopping
#stopping_rounds = 5, stopping_metric = "AUC",
#stopping_tolerance = 0.001, seed = 1234
)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 13%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|============================================================== | 95%
|
|=================================================================| 100%
toc()
## Model_Training: 108.369 sec elapsed
automl_leader <- automl_models_h2o@leader